#!/usr/bin/env python3

import re
import os
import sys
import requests
import time
import html
import threading
import argparse

from markdownify import markdownify
from bs4 import BeautifulSoup

# Proxy mapping passed as ``proxies=`` on every request made by get().
# main() rebinds it from the --proxy command-line option.
proxies = {}

# When True, download_file() fetches file contents through the page's raw URL;
# when False, the contents are rebuilt from the rendered HTML page instead.
# main() rebinds it from the -r/-p command-line options.
enable_raw = True

# Request headers sent with every request made by get().
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.66"
}

# Single requests session through which get() issues all requests.
session = requests.Session()


def get(url):
    """Fetch *url* with retries and return the response, or ``None`` on failure.

    A *url* that does not start with ``http://`` or ``https://`` is treated as
    a path on GitHub and is prefixed with ``https://github.com``.

    Up to five attempts are made, sleeping ``0.5 * attempt`` seconds before
    each one (so there is no wait before the first). The first response whose
    status code is below 400 is returned. If every attempt fails, a message is
    printed and ``None`` is returned, so callers must check the result.
    """
    if not url.startswith(("http://", "https://")):
        url = "https://github.com{}".format(url)

    for attempt in range(5):
        # Linear back-off between attempts.
        time.sleep(0.5 * attempt)
        try:
            response = session.get(
                url,
                headers=headers,
                proxies=proxies,
                # Without a timeout a stalled connection would block forever.
                timeout=30,
            )
        except requests.RequestException:
            # Connection problems and timeouts are retried; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        # Integer division yields the status class (2 for 2xx, 4 for 4xx...);
        # only client/server errors are treated as failures.
        if response.status_code // 100 < 4:
            return response
    print("Cannot get {}".format(url))
    return None


def get_file_path_url(url, dir_path):
    """Yield ``(local_path, href)`` for each entry of a directory page.

    The page at *url* is fetched with ``get`` and every
    ``div[role="rowheader"]`` element is inspected for its first link.
    ``local_path`` is ``dir_path + "/" + <link text>`` and ``href`` is the
    link's ``href`` attribute as found in the page.

    Nothing is yielded when the page cannot be fetched. Rows without a link
    or without an ``href`` are skipped.
    """
    response = get(url)
    if not response:
        # get() has already reported the failure; there is nothing to list.
        return

    soup = BeautifulSoup(response.text, "html.parser")
    for row in soup.select('div[role="rowheader"]'):
        link = row.select_one("a")
        if link is None:
            continue
        href = link.get("href")
        if not href:
            continue
        file_name = link.text.strip()
        # ".." / ". ." is presumably the link back to the parent directory;
        # following it would walk out of the requested tree.
        if file_name in ("..", ". ."):
            continue
        yield dir_path + "/" + file_name, href


def is_file(file_html):
    """Return the page's ``#raw-url`` element, or ``None`` if it has none.

    Callers use the result as a truth value: a page that contains a
    ``#raw-url`` element is treated as a single-file view.
    """
    return BeautifulSoup(file_html, "html.parser").select_one("#raw-url")


def parse_readme(html, file_type):
    """Convert the rendered ``<article>`` of a page back into text.

    Args:
        html: Full HTML source of the page.
        file_type: File extension without the dot. Only ``"md"`` and
            ``"rst"`` are handled.

    Returns:
        For ``"md"``, the article converted with ``markdownify``; for
        ``"rst"``, the article's plain text. ``None`` when *file_type* is
        anything else or when the page contains no ``<article>`` element,
        so the caller can fall back to another extraction method.
    """
    if file_type not in ("md", "rst"):
        return None

    match = re.search("<article.*?>(.*?)</article>", html, re.S)
    if match is None:
        # No rendered article on this page: signal "nothing parsed" instead
        # of failing on the missing match.
        return None
    # Newlines are dropped from the article markup before conversion.
    article = match.group(1).replace("\n", "")

    if file_type == "md":
        return markdownify(article)
    return BeautifulSoup(article, "html.parser").text


def download_file(file_path, url):
    """Download the page at *url* and save its contents to *file_path*.

    If the fetched page is not a single-file view (``is_file`` finds no
    ``#raw-url`` element), the work is handed over to ``download_dir``.

    When the module-level ``enable_raw`` is true, the bytes are fetched from
    the ``href`` of the page's ``#raw-url`` element. Otherwise the contents
    are rebuilt from the page itself: ``parse_readme`` is tried first, and if
    it yields nothing the text of the table cells whose id is ``LC<number>``
    is joined line by line. The result is written as UTF-8.

    Failures are reported with ``print`` and the function returns without
    writing anything.
    """
    response = get(url)

    if not response:
        print("Download {} Failed".format(file_path))
        return

    file_html = response.text
    if not is_file(file_html):
        return download_dir(file_path, url)

    print("Downloading {}".format(file_path))
    if enable_raw:
        raw_url = BeautifulSoup(file_html, "html.parser").select_one("#raw-url")
        response = get(raw_url["href"])
        if not response:
            print(
                "download {} failed with raw, please try with parse".format(file_path)
            )
            return
        content = response.content
    else:
        file_type = file_path.split(".")[-1]
        content = parse_readme(file_html, file_type)
        if not content:
            # Raw string: "\d" in a normal literal is an invalid escape.
            table_html = "\n".join(
                re.findall(r'(<td id="LC\d+".*?>.*?</td>)', file_html, re.S)
            )
            soup = BeautifulSoup("<pre>{}</pre>".format(table_html), "html.parser")
            # A cell holding only a newline (plus tabs/spaces) is an empty
            # source line; emit "" so the join does not double the newline.
            content = "\n".join(
                [
                    td.text if td.text.strip("\t ") != "\n" else ""
                    for td in soup.select("td")
                ]
            )
        content = content.encode("utf8")

    with open(file_path, "wb") as f:
        f.write(content)
    print("Downloaded {}".format(file_path))


def download_dir(dir_path, url):
    """Download the file or directory at *url* into *dir_path*.

    The page is fetched first. If it is a single-file view, one thread is
    started to download it to *dir_path*. Otherwise *dir_path* is created
    and one thread per listed entry is started running ``download_file``
    (which calls back into this function for sub-directories).

    Threads are started but not joined here. If the page cannot be fetched,
    the failure is printed and nothing is started.
    """
    response = get(url)
    if not response:
        # get() returns None once its retries are exhausted.
        print("Download {} Failed".format(dir_path))
        return

    if is_file(response.text):
        threading.Thread(target=download_file, args=(dir_path, url)).start()
        return

    os.makedirs(dir_path, exist_ok=True)
    for file_path, file_url in get_file_path_url(url, dir_path):
        threading.Thread(target=download_file, args=(file_path, file_url)).start()


class MyParse(argparse.ArgumentParser):
    """Argument parser whose help output is followed by usage examples."""

    def print_help(self, file=None):
        """Print the standard help text followed by example invocations.

        Args:
            file: Stream to write to. ``None`` means ``sys.stdout``, the same
                default as ``argparse.ArgumentParser.print_help``.
        """
        example_str = """
Example: 
  1. download by raw url: gitd -u \"https://github.com/twfb/git-directory-download\"
  2. download by raw url: gitd -r -u \"https://github.com/twfb/git-directory-download\"
  3. download by parsing: gitd -p -u \"https://github.com/twfb/git-directory-download\"
  4. download by raw url with proxy: gitd -r -u \"https://github.com/twfb/git-directory-download\" --proxy "socks5://127.0.0.1:7891"
  
"""
        super().print_help(file)
        # Send the examples to the same stream as the help text itself.
        print(example_str, file=file)


def main():
    """Parse the command line and start a download for each given URL.

    Rebinds the module-level ``enable_raw`` and ``proxies`` from the options.
    Without ``--url`` the help text is printed and nothing is downloaded.
    Each comma-separated URL is downloaded into a local path named after the
    URL's last path component.
    """
    global enable_raw
    global proxies

    parser = MyParse()

    parser.add_argument(
        "-u",
        "--url",
        help='github url, split by ",", example: "https://x, http://y"',
    )

    parser.add_argument(
        "-r",
        "--raw",
        help="download from raw url",
        action="store_true",
    )

    parser.add_argument(
        "-p",
        "--parse",
        help="download by parsing html",
        action="store_true",
    )

    parser.add_argument(
        "--proxy",
        help='proxy config, example "socks5://127.0.0.1:7891"',
    )
    args = parser.parse_args()
    # Raw download is the default; HTML parsing is used only when -p is
    # given without -r.
    enable_raw = args.raw or (not args.parse)
    # Only build proxy entries when a proxy was actually supplied, so that
    # no None-valued entries are handed to requests.
    proxies = {"http": args.proxy, "https": args.proxy} if args.proxy else {}
    if not args.url:
        parser.print_help()
        return

    for dir_url in args.url.split(","):
        dir_url = dir_url.strip().strip("/")
        if not dir_url:
            # Tolerate empty entries such as a trailing comma.
            continue
        # Last path component; the whole string when it contains no "/".
        dir_path = dir_url.rsplit("/", 1)[-1]
        download_dir(dir_path, dir_url)


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
